app_proc_time
columndata_path <- "/Users/adityodasgupta/Documents/McGill/ORGB/672_project_data/"
applications <- read_parquet(paste0(data_path,"app_data_sample.parquet"))
edges <- read_csv(paste0(data_path,"edges_sample.csv"))
## Rows: 32906 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): application_number
## dbl (2): ego_examiner_id, alter_examiner_id
## date (1): advice_date
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
applications
edges
#install_genderdata_package() # only run this line the first time you use the package, to get data for it
# get a list of first names without repetitions
examiner_names <- applications %>%
distinct(examiner_name_first)
#Now let's use function `gender()` as shown in the example for the package to attach a gender and probability to each name and put the results into the table `examiner_names_gender`
# get a table of names and gender
examiner_names_gender <- examiner_names %>%
do(results = gender(.$examiner_name_first, method = "ssa")) %>%
unnest(cols = c(results), keep_empty = TRUE) %>%
select(
examiner_name_first = name,
gender,
proportion_female
)
examiner_names_gender
# Finally, let's join that table back to our original applications data and discard the temporary tables we have just created to reduce clutter in our environment.
# remove extra columns from the gender table
examiner_names_gender <- examiner_names_gender %>%
select(examiner_name_first, gender)
# joining gender back to the dataset
applications <- applications %>%
left_join(examiner_names_gender, by = "examiner_name_first")
# cleaning up
rm(examiner_names)
rm(examiner_names_gender)
gc()
## used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells 4840433 258.6 8092220 432.2 NA 5198797 277.7
## Vcells 50504360 385.4 96616289 737.2 16384 80820144 616.7
examiner_surnames <- applications %>%
select(surname = examiner_name_last) %>%
distinct()
examiner_surnames
examiner_race <- predict_race(voter.file = examiner_surnames, surname.only = T) %>%
as_tibble()
## Warning: Unknown or uninitialised column: `state`.
## Proceeding with last name predictions...
## ℹ All local files already up-to-date!
## 701 (18.4%) individuals' last names were not matched.
examiner_race
examiner_race <- examiner_race %>%
mutate(max_race_p = pmax(pred.asi, pred.bla, pred.his, pred.oth, pred.whi)) %>%
mutate(race = case_when(
max_race_p == pred.asi ~ "Asian",
max_race_p == pred.bla ~ "black",
max_race_p == pred.his ~ "Hispanic",
max_race_p == pred.oth ~ "other",
max_race_p == pred.whi ~ "white",
TRUE ~ NA_character_
))
examiner_race
# Let's join the data back to the applications table.
# removing extra columns
examiner_race <- examiner_race %>%
select(surname,race)
applications <- applications %>%
left_join(examiner_race, by = c("examiner_name_last" = "surname"))
rm(examiner_race)
rm(examiner_surnames)
gc()
## used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells 4910516 262.3 8092220 432.2 NA 7553737 403.5
## Vcells 54895477 418.9 96616289 737.2 16384 96616180 737.2
examiner_dates <- applications %>%
select(examiner_id, filing_date, appl_status_date)
examiner_dates
examiner_dates <- examiner_dates %>%
mutate(start_date = ymd(filing_date), end_date = as_date(dmy_hms(appl_status_date)))
examiner_dates <- examiner_dates %>%
group_by(examiner_id) %>%
summarise(
earliest_date = min(start_date, na.rm = TRUE),
latest_date = max(end_date, na.rm = TRUE),
tenure_days = interval(earliest_date, latest_date) %/% days(1)
) %>%
filter(year(latest_date)<2018)
examiner_dates
# Joining back to the applications data.
applications <- applications %>%
left_join(examiner_dates, by = "examiner_id")
rm(examiner_dates)
gc()
## used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells 4916940 262.6 14380074 768.0 NA 14380074 768.0
## Vcells 65271493 498.0 139303455 1062.9 16384 139303176 1062.8
# Remove Nas from status date and gender
applications <- applications %>%
filter(!is.na(appl_status_date) | !is.na(gender) | !is.na(race))
# Clean Date format
#get the date format cleaned
applications$Date_time=as.Date(applications$appl_status_date, format="%d%b%Y")
#get the date format for the filing date cleaned
applications$filing_date=as.Date(applications$filing_date, format="%d%b%Y")
#Remove all the data we will not need based on application status
exclude_list=c("PEND")
applications <- applications %>%
filter(!disposal_type %in% exclude_list)
#Setting Gender as factor
applications$gender = as.factor(applications$gender)
#Setting ethnicity as factor
applications$race = as.factor(applications$race)
#Setting disposal type as factor
applications$disposal_type = as.factor(applications$disposal_type)
#setting the technology center as a factor
applications$tc = as.factor(applications$tc)
#this is the amount of time in days that the applications take
applications$app_proc_time <- applications$Date_time - applications$filing_date
applications$app_proc_time <- as.numeric(applications$app_proc_time)
##Nodes & Edges Creation First we need to create the netwrok data to calculate centrality We will remove any records that contain NAs to avoid future issues with coding
#Create the edges from edge data
edges_backup=edges
#edges=edges_backup
edges <- edges %>%
mutate(from=ego_examiner_id,to=alter_examiner_id) %>%
select(from, to) %>%
drop_na()
#Create Nodes from Edges Data
nodes <-as.data.frame(do.call(rbind,append(as.list(edges$from),as.list(edges$to))))
nodes <- nodes %>%
mutate(id=V1) %>%
select(id) %>%
distinct(id) %>%
drop_na()
We will now add 3 closeness measures to the nodes data frame:
1.Degree Centrality: The number of connections (or edges) that each node has. 2. Closness Centrality : A measure that calculates the ability to spread information efficiently via the edges the node is connected to. It is calculated as the inverse of the average shortest path between nodes. 3: Betweenness Centrality: A measure that detects a node’s influence over the flow of information within a graph.
g <- igraph::graph_from_data_frame(edges, vertices = nodes) %>% as_tbl_graph(directed=TRUE)
#not sure why this isnt working
#g = tbl_graph(nodes = nodes, edges = edges, directed = FALSE)
g <- g %>%
activate(nodes) %>%
mutate(degree_cen = centrality_degree(),
closeness_cen = centrality_closeness(),
betweenness_cen = centrality_betweenness()) %>%
activate(edges)
tg_nodes <-
g %>%
activate(nodes) %>%
data.frame() %>%
mutate(name=as.integer(name))
nodes <- nodes %>%
left_join(tg_nodes,by=c("id"="name"))
remove(g,tg_nodes)
Time to visualise the degree centralities and numeric data
final_data <- applications %>%
left_join(nodes,by=c("examiner_id"="id"))
net <- igraph::graph_from_data_frame(edges, vertices = nodes) %>% as_tbl_graph(directed=TRUE)
plot(net, edge.arrow.size=.4,vertex.label=NA,vertex.size=4,vertex.color="blue",
edge.color="green")
# Degree centrality linear regression model
model_degree <- lm(app_proc_time ~ degree_cen + gender + race + tenure_days, data = final_data)
# Betweenness centrality linear regression model
model_betweenness <- lm(app_proc_time ~ betweenness_cen + gender + race + tenure_days, data = final_data)
# Closeness centrality linear regression model
model_closeness <- lm(app_proc_time ~ closeness_cen + gender + race + tenure_days, data = final_data)
# Display the model summaries
summary(model_degree)
##
## Call:
## lm(formula = app_proc_time ~ degree_cen + gender + race + tenure_days,
## data = final_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1575.7 -662.8 -280.2 327.6 4727.8
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 777.94239 9.70831 80.132 < 2e-16 ***
## degree_cen -0.30643 0.04044 -7.578 3.51e-14 ***
## gendermale -4.01200 2.32959 -1.722 0.085035 .
## raceblack -56.75960 6.00087 -9.459 < 2e-16 ***
## raceHispanic 64.06730 7.64724 8.378 < 2e-16 ***
## raceother 122.21968 31.55291 3.873 0.000107 ***
## racewhite -6.13950 2.43558 -2.521 0.011710 *
## tenure_days 0.12773 0.00157 81.357 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1017 on 906469 degrees of freedom
## (782240 observations deleted due to missingness)
## Multiple R-squared: 0.007769, Adjusted R-squared: 0.007761
## F-statistic: 1014 on 7 and 906469 DF, p-value: < 2.2e-16
summary(model_betweenness)
##
## Call:
## lm(formula = app_proc_time ~ betweenness_cen + gender + race +
## tenure_days, data = final_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1767.4 -662.3 -280.1 326.8 4723.5
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.713e+02 9.616e+00 80.217 < 2e-16 ***
## betweenness_cen 2.740e-03 1.631e-04 16.797 < 2e-16 ***
## gendermale -4.854e+00 2.330e+00 -2.083 0.0372 *
## raceblack -5.345e+01 5.999e+00 -8.911 < 2e-16 ***
## raceHispanic 6.729e+01 7.648e+00 8.799 < 2e-16 ***
## raceother 1.273e+02 3.155e+01 4.036 5.43e-05 ***
## racewhite -4.723e+00 2.435e+00 -1.939 0.0525 .
## tenure_days 1.279e-01 1.564e-03 81.755 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1017 on 906469 degrees of freedom
## (782240 observations deleted due to missingness)
## Multiple R-squared: 0.008015, Adjusted R-squared: 0.008007
## F-statistic: 1046 on 7 and 906469 DF, p-value: < 2.2e-16
summary(model_closeness)
##
## Call:
## lm(formula = app_proc_time ~ closeness_cen + gender + race +
## tenure_days, data = final_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1551.0 -634.6 -252.3 338.0 4739.2
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 840.624051 12.013499 69.973 < 2e-16 ***
## closeness_cen -54.268985 3.522769 -15.405 < 2e-16 ***
## gendermale -7.600628 2.738096 -2.776 0.00551 **
## raceblack -8.696138 7.157424 -1.215 0.22437
## raceHispanic 26.742143 8.680627 3.081 0.00207 **
## raceother 16.020405 54.142223 0.296 0.76731
## racewhite -35.515944 2.904359 -12.228 < 2e-16 ***
## tenure_days 0.122323 0.001984 61.646 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 968.3 on 588079 degrees of freedom
## (1100630 observations deleted due to missingness)
## Multiple R-squared: 0.006878, Adjusted R-squared: 0.006867
## F-statistic: 581.9 on 7 and 588079 DF, p-value: < 2.2e-16
Get the summary of the linear regressions!
model_degree_interaction <- lm(app_proc_time ~ degree_cen * gender + race + tenure_days, data = final_data)
model_betweenness_interaction <- lm(app_proc_time ~ betweenness_cen * gender + race + tenure_days, data = final_data)
model_closeness_interaction <- lm(app_proc_time ~ closeness_cen * gender + race + tenure_days, data = final_data)
summary(model_degree_interaction)
##
## Call:
## lm(formula = app_proc_time ~ degree_cen * gender + race + tenure_days,
## data = final_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1573.7 -662.8 -280.1 327.6 4728.3
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 773.56395 9.74887 79.349 < 2e-16 ***
## degree_cen 0.05449 0.08375 0.651 0.515280
## gendermale 0.61030 2.51179 0.243 0.808026
## raceblack -56.70939 6.00081 -9.450 < 2e-16 ***
## raceHispanic 64.93067 7.64916 8.489 < 2e-16 ***
## raceother 121.68636 31.55269 3.857 0.000115 ***
## racewhite -6.27995 2.43571 -2.578 0.009929 **
## tenure_days 0.12788 0.00157 81.437 < 2e-16 ***
## degree_cen:gendermale -0.46938 0.09538 -4.921 8.61e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1017 on 906468 degrees of freedom
## (782240 observations deleted due to missingness)
## Multiple R-squared: 0.007795, Adjusted R-squared: 0.007786
## F-statistic: 890.2 on 8 and 906468 DF, p-value: < 2.2e-16
summary(model_betweenness_interaction)
##
## Call:
## lm(formula = app_proc_time ~ betweenness_cen * gender + race +
## tenure_days, data = final_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1823.4 -662.3 -280.0 326.7 4727.5
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.721e+02 9.616e+00 80.289 < 2e-16 ***
## betweenness_cen 9.875e-04 3.439e-04 2.871 0.00409 **
## gendermale -6.493e+00 2.347e+00 -2.766 0.00567 **
## raceblack -5.360e+01 5.999e+00 -8.934 < 2e-16 ***
## raceHispanic 6.756e+01 7.648e+00 8.834 < 2e-16 ***
## raceother 1.281e+02 3.155e+01 4.062 4.87e-05 ***
## racewhite -4.291e+00 2.436e+00 -1.761 0.07816 .
## tenure_days 1.279e-01 1.564e-03 81.766 < 2e-16 ***
## betweenness_cen:gendermale 2.261e-03 3.906e-04 5.789 7.08e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1017 on 906468 degrees of freedom
## (782240 observations deleted due to missingness)
## Multiple R-squared: 0.008051, Adjusted R-squared: 0.008043
## F-statistic: 919.7 on 8 and 906468 DF, p-value: < 2.2e-16
summary(model_closeness_interaction)
##
## Call:
## lm(formula = app_proc_time ~ closeness_cen * gender + race +
## tenure_days, data = final_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1579.5 -634.6 -252.5 338.2 4739.2
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 823.204934 12.207231 67.436 < 2e-16 ***
## closeness_cen -13.385628 6.192129 -2.162 0.03064 *
## gendermale 11.433802 3.621863 3.157 0.00159 **
## raceblack -12.332807 7.171360 -1.720 0.08548 .
## raceHispanic 21.047666 8.709093 2.417 0.01566 *
## raceother 11.174069 54.142668 0.206 0.83649
## racewhite -35.836249 2.904477 -12.338 < 2e-16 ***
## tenure_days 0.123109 0.001987 61.970 < 2e-16 ***
## closeness_cen:gendermale -59.756536 7.443424 -8.028 9.92e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 968.2 on 588078 degrees of freedom
## (1100630 observations deleted due to missingness)
## Multiple R-squared: 0.006987, Adjusted R-squared: 0.006974
## F-statistic: 517.2 on 8 and 588078 DF, p-value: < 2.2e-16
Interpretations:
On an average looking at the linear regression models:
if the race is white application processing time decreases by the most
if the race is hispanic application processing time increases by the most
if gender is male it takes less time than female
longer the tenure more the time taken
and if a male is processing an application of another male then it makes a significant decrease in time